/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * NumericCleaner.java
 * Copyright (C) 2006-2012 University of Waikato, Hamilton, New Zealand
 */

package weka.filters.unsupervised.attribute;

import java.util.Collections;
import java.util.Enumeration;
import java.util.Vector;

import weka.core.Capabilities;
import weka.core.Capabilities.Capability;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.Utils;
import weka.core.WeightedAttributesHandler;
import weka.core.WeightedInstancesHandler;
import weka.filters.SimpleStreamFilter;

/**
 * <!-- globalinfo-start --> A filter that 'cleanses' the numeric data from
 * values that are too small, too big or very close to a certain value, and sets
 * these values to a pre-defined default.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * <!-- options-start --> Valid options are:
 * <p/>
 * 
 * <pre>
 * -output-debug-info
 *  Turns on output of debugging information.
 * </pre>
 * 
 * <pre>
 * -min &lt;double&gt;
 *  The minimum threshold below which values are replaced by the corresponding default.
 *  (default -Double.MAX_VALUE)
 * </pre>
 * 
 * <pre>
 * -min-default &lt;double&gt;
 *  The replacement for values smaller than the minimum threshold.
 *  (default -Double.MAX_VALUE)
 * </pre>
 * 
 * <pre>
 * -max &lt;double&gt;
 *  The maximum threshold above which values are replaced by the corresponding default.
 *  (default Double.MAX_VALUE)
 * </pre>
 * 
 * <pre>
 * -max-default &lt;double&gt;
 *  The replacement for values larger than the maximum threshold.
 *  (default Double.MAX_VALUE)
 * </pre>
 * 
 * <pre>
 * -closeto &lt;double&gt;
 *  The value with respect to which closeness is determined. (default 0)
 * </pre>
 * 
 * <pre>
 * -closeto-default &lt;double&gt;
 *  The replacement for values that are too close to '-closeto'.
 *  (default 0)
 * </pre>
 * 
 * <pre>
 * -closeto-tolerance &lt;double&gt;
 *  The tolerance for testing whether a value is too close. (default 1E-6)
 * </pre>
 * 
 * <pre>
 * -decimals &lt;int&gt;
 *  The number of decimals to round to, -1 means no rounding at all.
 *  (default -1)
 * </pre>
 * 
 * <pre>
 * -R &lt;col1,col2,...&gt;
 *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
 *  (default first-last)
 * </pre>
 * 
 * <pre>
 * -V
 *  Inverts the matching sense.
 * </pre>
 * 
 * <pre>
 * -include-class
 *  Whether to include the class in the cleansing.
 *  The class column will always be skipped if this flag is not
 *  present. (default no)
 * </pre>
 * 
 * <!-- options-end -->
 * 
 * @author fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision$
 */
public class NumericCleaner extends SimpleStreamFilter implements WeightedAttributesHandler, WeightedInstancesHandler {

    /** for serialization */
    private static final long serialVersionUID = -352890679895066592L;

    /** the minimum threshold; values strictly below it are replaced */
    protected double m_MinThreshold = -Double.MAX_VALUE;

    /** the replacement for values below the minimum threshold */
    protected double m_MinDefault = -Double.MAX_VALUE;

    /** the maximum threshold; values strictly above it are replaced */
    protected double m_MaxThreshold = Double.MAX_VALUE;

    /** the replacement for values above the maximum threshold */
    protected double m_MaxDefault = Double.MAX_VALUE;

    /** the number the values are checked for closeness to */
    protected double m_CloseTo = 0;

    /** the default replacement value for numbers "close-to" */
    protected double m_CloseToDefault = 0;

    /**
     * the tolerance distance, below which numbers are considered being "close-to"
     */
    protected double m_CloseToTolerance = 1E-6;

    /** Stores which columns to cleanse */
    protected Range m_Cols = new Range("first-last");

    /** whether to include the class attribute */
    protected boolean m_IncludeClass = false;

    /** the number of decimals to round to (-1 means no rounding) */
    protected int m_Decimals = -1;

    /**
     * Returns a string describing this filter.
     * 
     * @return a description of the filter suitable for displaying in the
     *         explorer/experimenter gui
     */
    @Override
    public String globalInfo() {
        return "A filter that 'cleanses' the numeric data from values that are too "
            + "small, too big or very close to a certain value, and sets "
            + "these values to a pre-defined default.";
    }

    /**
     * Returns an enumeration describing the available options. The options of
     * this filter come first, followed by those of the superclass.
     * 
     * @return an enumeration of all the available options.
     */
    @Override
    public Enumeration<Option> listOptions() {

        Vector<Option> result = new Vector<Option>(11);

        result.addElement(new Option(
            "\tThe minimum threshold below which values are replaced by the corresponding default.\n"
                + "\t(default -Double.MAX_VALUE)",
            "min", 1, "-min <double>"));

        result.addElement(new Option(
            "\tThe replacement for values smaller than the minimum threshold.\n"
                + "\t(default -Double.MAX_VALUE)",
            "min-default", 1, "-min-default <double>"));

        result.addElement(new Option(
            "\tThe maximum threshold above which values are replaced by the corresponding default.\n"
                + "\t(default Double.MAX_VALUE)",
            "max", 1, "-max <double>"));

        result.addElement(new Option(
            "\tThe replacement for values larger than the maximum threshold.\n"
                + "\t(default Double.MAX_VALUE)",
            "max-default", 1, "-max-default <double>"));

        result.addElement(new Option(
            "\tThe value with respect to which closeness is determined. (default 0)",
            "closeto", 1, "-closeto <double>"));

        result.addElement(new Option(
            "\tThe replacement for values that are too close to '-closeto'.\n"
                + "\t(default 0)",
            "closeto-default", 1, "-closeto-default <double>"));

        result.addElement(new Option(
            "\tThe tolerance for testing whether a value is too close.\n"
                + "\t(default 1E-6)",
            "closeto-tolerance", 1, "-closeto-tolerance <double>"));

        result.addElement(new Option(
            "\tThe number of decimals to round to, -1 means no rounding at all.\n"
                + "\t(default -1)",
            "decimals", 1, "-decimals <int>"));

        result.addElement(new Option(
            "\tThe list of columns to cleanse, e.g., first-last or first-3,5-last.\n"
                + "\t(default first-last)",
            "R", 1, "-R <col1,col2,...>"));

        result.addElement(new Option("\tInverts the matching sense.", "V", 0, "-V"));

        result.addElement(new Option(
            "\tWhether to include the class in the cleansing.\n"
                + "\tThe class column will always be skipped if this flag is not\n"
                + "\tpresent. (default no)",
            "include-class", 0, "-include-class"));

        result.addAll(Collections.list(super.listOptions()));

        return result.elements();
    }

    /**
     * Gets the current settings of the filter. Every valued option is always
     * emitted (even when it still holds its default); the flags -V and
     * -include-class are only emitted when set. The superclass options are
     * appended at the end.
     * 
     * @return an array of strings suitable for passing to setOptions
     */
    @Override
    public String[] getOptions() {

        Vector<String> result = new Vector<String>(20);

        result.add("-min");
        result.add("" + m_MinThreshold);

        result.add("-min-default");
        result.add("" + m_MinDefault);

        result.add("-max");
        result.add("" + m_MaxThreshold);

        result.add("-max-default");
        result.add("" + m_MaxDefault);

        result.add("-closeto");
        result.add("" + m_CloseTo);

        result.add("-closeto-default");
        result.add("" + m_CloseToDefault);

        result.add("-closeto-tolerance");
        result.add("" + m_CloseToTolerance);

        result.add("-R");
        result.add("" + m_Cols.getRanges());

        if (m_Cols.getInvert()) {
            result.add("-V");
        }

        if (m_IncludeClass) {
            result.add("-include-class");
        }

        result.add("-decimals");
        result.add("" + getDecimals());

        Collections.addAll(result, super.getOptions());

        return result.toArray(new String[result.size()]);
    }

    /**
     * Parses a given list of options. Any option that is not supplied is reset
     * to its default value.
     * <p/>
     * 
     * <!-- options-start --> Valid options are:
     * <p/>
     *
     * <pre>
     * -output-debug-info
     *  Turns on output of debugging information.
     * </pre>
     *
     * <pre>
     * -min &lt;double&gt;
     *  The minimum threshold below which values are replaced by the corresponding default.
     *  (default -Double.MAX_VALUE)
     * </pre>
     *
     * <pre>
     * -min-default &lt;double&gt;
     *  The replacement for values smaller than the minimum threshold.
     *  (default -Double.MAX_VALUE)
     * </pre>
     *
     * <pre>
     * -max &lt;double&gt;
     *  The maximum threshold above which values are replaced by the corresponding default.
     *  (default Double.MAX_VALUE)
     * </pre>
     *
     * <pre>
     * -max-default &lt;double&gt;
     *  The replacement for values larger than the maximum threshold.
     *  (default Double.MAX_VALUE)
     * </pre>
     *
     * <pre>
     * -closeto &lt;double&gt;
     *  The value with respect to which closeness is determined. (default 0)
     * </pre>
     *
     * <pre>
     * -closeto-default &lt;double&gt;
     *  The replacement for values that are too close to '-closeto'.
     *  (default 0)
     * </pre>
     *
     * <pre>
     * -closeto-tolerance &lt;double&gt;
     *  The tolerance for testing whether a value is too close. (default 1E-6)
     * </pre>
     *
     * <pre>
     * -decimals &lt;int&gt;
     *  The number of decimals to round to, -1 means no rounding at all.
     *  (default -1)
     * </pre>
     *
     * <pre>
     * -R &lt;col1,col2,...&gt;
     *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
     *  (default first-last)
     * </pre>
     *
     * <pre>
     * -V
     *  Inverts the matching sense.
     * </pre>
     *
     * <pre>
     * -include-class
     *  Whether to include the class in the cleansing.
     *  The class column will always be skipped if this flag is not
     *  present. (default no)
     * </pre>
     *
     * <!-- options-end -->
     * 
     * @param options the list of options as an array of strings
     * @throws Exception if an option is not supported
     */
    @Override
    public void setOptions(String[] options) throws Exception {

        // Each option is fetched and applied in turn, so the order in which
        // options are consumed (and setters invoked) stays fixed.
        setMinThreshold(doubleOption("min", options, -Double.MAX_VALUE));
        setMinDefault(doubleOption("min-default", options, -Double.MAX_VALUE));
        setMaxThreshold(doubleOption("max", options, Double.MAX_VALUE));
        setMaxDefault(doubleOption("max-default", options, Double.MAX_VALUE));
        setCloseTo(doubleOption("closeto", options, 0));
        setCloseToDefault(doubleOption("closeto-default", options, 0));
        setCloseToTolerance(doubleOption("closeto-tolerance", options, 1E-6));

        String tmpStr = Utils.getOption("R", options);
        if (tmpStr.length() != 0) {
            setAttributeIndices(tmpStr);
        } else {
            setAttributeIndices("first-last");
        }

        setInvertSelection(Utils.getFlag("V", options));

        setIncludeClass(Utils.getFlag("include-class", options));

        tmpStr = Utils.getOption("decimals", options);
        if (tmpStr.length() != 0) {
            setDecimals(Integer.parseInt(tmpStr));
        } else {
            setDecimals(-1);
        }

        super.setOptions(options);

        Utils.checkForRemainingOptions(options);
    }

    /**
     * Fetches a double-valued option via {@link Utils#getOption(String, String[])}
     * and parses it, falling back to the given default when the returned string
     * is empty.
     * 
     * @param flag the name of the option (without leading dash)
     * @param options the option array to read from
     * @param defaultValue the value to return if the option string is empty
     * @return the parsed option value or the default
     * @throws Exception if retrieving the option fails or the value is not a
     *           parsable double
     */
    private static double doubleOption(String flag, String[] options, double defaultValue) throws Exception {
        String tmpStr = Utils.getOption(flag, options);
        if (tmpStr.length() != 0) {
            return Double.parseDouble(tmpStr);
        }
        return defaultValue;
    }

    /**
     * Returns the Capabilities of this filter.
     * 
     * @return the capabilities of this object
     * @see Capabilities
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.disableAll();

        // attributes
        result.enableAllAttributes();
        result.enable(Capability.MISSING_VALUES);

        // class
        result.enableAllClasses();
        result.enable(Capability.MISSING_CLASS_VALUES);
        result.enable(Capability.NO_CLASS);

        return result;
    }

    /**
     * Determines the output format based on the input format and returns this. In
     * case the output format cannot be returned immediately, i.e.,
     * immediateOutputFormat() returns false, then this method will be called from
     * batchFinished().
     * <p/>
     * As a side effect, the upper bound of the column range is set to the index
     * of the last attribute of the input format.
     * 
     * @param inputFormat the input format to base the output format on
     * @return the output format
     * @throws Exception in case the determination goes wrong
     * @see #hasImmediateOutputFormat()
     * @see #batchFinished()
     */
    @Override
    protected Instances determineOutputFormat(Instances inputFormat) throws Exception {

        m_Cols.setUpper(inputFormat.numAttributes() - 1);

        return new Instances(inputFormat);
    }

    /**
     * processes the given instance (may change the provided instance) and returns
     * the modified version.
     * <p/>
     * For every numeric attribute that is inside the column range (and is not
     * the class attribute, unless the class is included) exactly one of the
     * following replacements is applied, checked in this order: below the
     * minimum threshold, above the maximum threshold, within the tolerance of
     * (but not equal to) the "close-to" value. Afterwards the value is rounded
     * if a number of decimals has been set. Debug messages are printed to
     * stdout when debugging is enabled.
     * 
     * @param instance the instance to process
     * @return the modified data
     * @throws Exception in case the processing goes wrong
     */
    @Override
    protected Instance process(Instance instance) throws Exception {

        double[] result = new double[instance.numAttributes()];

        // scaling factor for rounding; only used when rounding is enabled
        double factor;
        if (m_Decimals > -1) {
            factor = StrictMath.pow(10, m_Decimals);
        } else {
            factor = 1;
        }

        for (int i = 0; i < instance.numAttributes(); i++) {

            // start from the original value; it is kept unless a rule below fires
            result[i] = instance.value(i);

            // only numeric attributes
            if (!instance.attribute(i).isNumeric()) {
                continue;
            }

            // out of range?
            if (!m_Cols.isInRange(i)) {
                continue;
            }

            // skip class?
            if ((instance.classIndex() == i) && (!m_IncludeClass)) {
                continue;
            }

            // too small?
            if (result[i] < m_MinThreshold) {
                if (getDebug()) {
                    System.out.println("Too small: " + result[i] + " -> " + m_MinDefault);
                }
                result[i] = m_MinDefault;
            }
            // too big?
            else if (result[i] > m_MaxThreshold) {
                if (getDebug()) {
                    System.out.println("Too big: " + result[i] + " -> " + m_MaxDefault);
                }
                result[i] = m_MaxDefault;
            }
            // too close? (strictly inside the tolerance band, but not the
            // "close-to" value itself)
            else if ((result[i] - m_CloseTo < m_CloseToTolerance) && (m_CloseTo - result[i] < m_CloseToTolerance) && (result[i] != m_CloseTo)) {
                if (getDebug()) {
                    System.out.println("Too close: " + result[i] + " -> " + m_CloseToDefault);
                }
                result[i] = m_CloseToDefault;
            }

            // decimals?
            if (m_Decimals > -1 && !Utils.isMissingValue(result[i])) {
                result[i] = roundScaled(result[i], factor);
            }
        }

        return instance.copy(result);
    }

    /**
     * Rounds a value to the precision implied by the given power-of-ten factor.
     * <p/>
     * {@link StrictMath#round(double)} returns a long and therefore clamps its
     * argument to the long range (and maps NaN to 0). Whenever the scaled value
     * falls outside that range, or is not a number, the input is returned
     * unchanged instead of a clamped result; a value of that magnitude has no
     * fractional digits left that rounding could remove.
     * 
     * @param value the value to round
     * @param factor ten to the power of the number of decimals to keep
     * @return the rounded value, or the value itself if it cannot be rounded
     *         without overflowing
     */
    private static double roundScaled(double value, double factor) {
        double scaled = value * factor;
        // negated comparison so that a NaN product also takes this branch
        if (!(StrictMath.abs(scaled) < Long.MAX_VALUE)) {
            return value;
        }
        return StrictMath.round(scaled) / factor;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String minThresholdTipText() {
        return "The minimum threshold below which values are replaced by the corresponding default.";
    }

    /**
     * Get the minimum threshold.
     * 
     * @return the minimum threshold.
     */
    public double getMinThreshold() {
        return m_MinThreshold;
    }

    /**
     * Set the minimum threshold.
     * 
     * @param value the minimum threshold to use.
     */
    public void setMinThreshold(double value) {
        m_MinThreshold = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String minDefaultTipText() {
        return "The replacement for values smaller than the minimum threshold.";
    }

    /**
     * Get the minimum default.
     * 
     * @return the minimum default.
     */
    public double getMinDefault() {
        return m_MinDefault;
    }

    /**
     * Set the minimum default.
     * 
     * @param value the minimum default to use.
     */
    public void setMinDefault(double value) {
        m_MinDefault = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String maxThresholdTipText() {
        return "The maximum threshold above which values are replaced by the corresponding default.";
    }

    /**
     * Get the maximum threshold.
     * 
     * @return the maximum threshold.
     */
    public double getMaxThreshold() {
        return m_MaxThreshold;
    }

    /**
     * Set the maximum threshold.
     * 
     * @param value the maximum threshold to use.
     */
    public void setMaxThreshold(double value) {
        m_MaxThreshold = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String maxDefaultTipText() {
        return "The replacement for values larger than the maximum threshold.";
    }

    /**
     * Get the maximum default.
     * 
     * @return the maximum default.
     */
    public double getMaxDefault() {
        return m_MaxDefault;
    }

    /**
     * Set the maximum default.
     * 
     * @param value the maximum default to use.
     */
    public void setMaxDefault(double value) {
        m_MaxDefault = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String closeToTipText() {
        return "The value with respect to which closeness is determined.";
    }

    /**
     * Get the "close to" number.
     * 
     * @return the "close to" number.
     */
    public double getCloseTo() {
        return m_CloseTo;
    }

    /**
     * Set the "close to" number.
     * 
     * @param value the number to use for checking closeness.
     */
    public void setCloseTo(double value) {
        m_CloseTo = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String closeToDefaultTipText() {
        return "The replacement for values that are too close.";
    }

    /**
     * Get the "close to" default.
     * 
     * @return the "close to" default.
     */
    public double getCloseToDefault() {
        return m_CloseToDefault;
    }

    /**
     * Set the "close to" default.
     * 
     * @param value the "close to" default to use.
     */
    public void setCloseToDefault(double value) {
        m_CloseToDefault = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String closeToToleranceTipText() {
        return "The tolerance for testing whether a value is too close.";
    }

    /**
     * Get the "close to" Tolerance.
     * 
     * @return the "close to" Tolerance.
     */
    public double getCloseToTolerance() {
        return m_CloseToTolerance;
    }

    /**
     * Set the "close to" Tolerance.
     * 
     * @param value the "close to" Tolerance to use.
     */
    public void setCloseToTolerance(double value) {
        m_CloseToTolerance = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String attributeIndicesTipText() {
        return "The selection of columns to use in the cleansing process, first and last are valid indices.";
    }

    /**
     * Gets the selection of the columns, e.g., first-last or first-3,5-last
     * 
     * @return the selected indices
     */
    public String getAttributeIndices() {
        return m_Cols.getRanges();
    }

    /**
     * Sets the columns to use, e.g., first-last or first-3,5-last
     * 
     * @param value the columns to use
     */
    public void setAttributeIndices(String value) {
        m_Cols.setRanges(value);
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String invertSelectionTipText() {
        return "If enabled, the selection of the columns is inverted.";
    }

    /**
     * Gets whether the selection of the columns is inverted
     * 
     * @return true if the selection is inverted
     */
    public boolean getInvertSelection() {
        return m_Cols.getInvert();
    }

    /**
     * Sets whether the selection of the indices is inverted or not
     * 
     * @param value the new invert setting
     */
    public void setInvertSelection(boolean value) {
        m_Cols.setInvert(value);
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String includeClassTipText() {
        return "If disabled, the class attribute will be left out of the cleaning process.";
    }

    /**
     * Gets whether the class is included in the cleaning process or always skipped.
     * 
     * @return true if the class can be considered for cleaning.
     */
    public boolean getIncludeClass() {
        return m_IncludeClass;
    }

    /**
     * Sets whether the class can be cleaned, too.
     * 
     * @param value true if the class can be cleansed, too
     */
    public void setIncludeClass(boolean value) {
        m_IncludeClass = value;
    }

    /**
     * Returns the tip text for this property
     * 
     * @return tip text for this property suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String decimalsTipText() {
        return "The number of decimals to round to, -1 means no rounding at all.";
    }

    /**
     * Get the number of decimals to round to.
     * 
     * @return the number of decimals.
     */
    public int getDecimals() {
        return m_Decimals;
    }

    /**
     * Set the number of decimals to round to.
     * 
     * @param value the number of decimals.
     */
    public void setDecimals(int value) {
        m_Decimals = value;
    }

    /**
     * Runs the filter from commandline, use "-h" to see all options.
     * 
     * @param args the commandline options for the filter
     */
    public static void main(String[] args) {
        runFilter(new NumericCleaner(), args);
    }
}
